import numpy as np
from sklearn.preprocessing import StandardScaler

def gen_lin_data(d: int, synth_gauss: dict):
    '''
    Generate a synthetic Gaussian binary-classification data set.

    Features are drawn from a zero-mean multivariate normal and then
    standardized with StandardScaler; labels are obtained by pushing a noisy
    linear score through a sigmoid and thresholding at 0.5. This is the
    synthetic Gaussian data generating process described in the writeup,
    which is also the same process as in Pawelczyk et al.

    Args:
        d: data dimension (int)
        synth_gauss: Gaussian model parameters (dict) with the keys
            'n': number of samples to draw
            'corr': value added to every entry of the covariance matrix
            'sigma2': value added on top of 'corr' on the diagonal of the
                covariance matrix
            'frac_important_features': fraction of the d features that
                receive a non-zero coefficient
            'sigma2_eps': variance of the additive noise on the score
    Returns:
        X: standardized features (np.array of shape (n, d))
        Y_disc: integer 0/1 labels (np.array of shape (n,))
    Raises:
        ValueError: if floor(d * frac_important_features) is smaller than 1
            or larger than d, since the coefficient vector could then not
            be normalized to unit length.
    '''
    n = synth_gauss['n']

    sigma2 = synth_gauss['sigma2']
    frac_important_features = synth_gauss['frac_important_features']
    # number of important features (i.e. dimensions)
    d_relevant = int(np.floor(d * frac_important_features))
    # Fail early (before any random numbers are consumed): with no relevant
    # feature the coefficient vector is all zeros and its normalization
    # below would be 0 / 0, silently turning every score into NaN.
    if not 1 <= d_relevant <= d:
        raise ValueError(
            'frac_important_features must select between 1 and d features; '
            f'got d={d}, frac_important_features={frac_important_features} '
            f'(-> {d_relevant} relevant features)'
        )

    mu = np.zeros(d)
    Sigma2 = np.ones((d, d)) * synth_gauss['corr'] + np.eye(d) * sigma2

    ### true parameter vector ###
    beta0 = np.zeros(d)
    # candidate coefficients, sampled uniformly from the hypercube [-1, 1]^d
    beta = np.random.random(d) * 2 - 1
    # Keep exactly the d_relevant coefficients of largest magnitude.
    # Thresholding at an interpolated quantile can select a different number
    # of entries than d_relevant, which would break the assignment below.
    # The kept indices are sorted so the coefficients retain their original
    # relative order.
    largest = np.argsort(np.abs(beta))[-d_relevant:]
    indices = np.sort(largest)

    # the relevant coefficients occupy the first d_relevant dimensions
    beta0[0:d_relevant] = beta[indices]
    # normalize beta0 into a unit vector
    beta1 = beta0 / np.linalg.norm(beta0, ord=2)

    # error variance parameters
    sigma2_eps = synth_gauss['sigma2_eps']
    sigma_eps = np.sqrt(sigma2_eps)
    eps = np.random.normal(0, sigma_eps, n)

    # generate data
    X = np.random.multivariate_normal(mu, Sigma2, n)
    # standardize generated data
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    # generate labels
    score = X @ beta1 + eps  # score = X * beta + eps
    prob = 1 / (1 + np.exp(-score))
    # final label: 1 where the sigmoid probability exceeds 0.5, else 0
    Y_disc = (prob > 0.5) * 1

    return X, Y_disc
